try:
#for data processing, data visualization preparation, data analysis
import pandas as pd
import numpy as np
#for data profiling
from ydata_profiling import ProfileReport
#for data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#for missing value visualization
import missingno as msno
#for data processing/manipulation
import pyspark
from pyspark.sql import SparkSession
#for joining path to folder
import os
import datetime
except ImportError:
!pip install pandas
!pip install ydata_profiling
!pip install numpy
!pip install os
!pip install matplotlib.pyplot
!pip install seaborn
!pip install missingno
!pip install pyspark
import pandas
import ydata_profiling
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
folder_path = r'/Users/torresliu/Desktop/Perpay/drive-download-20230516T193115Z-001'
user_file_path = os.path.join(folder_path, 'user_dataset.csv')
loan_file_path = os.path.join(folder_path, 'loan_dataset.csv')
user_df = pd.read_csv(user_file_path)
loan_df = pd.read_csv(loan_file_path)
user_df
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc | 2020-10-13 00:06:21.603537 | Other > Add a new company | 1000.0 | 1 | 2022-11-22 01:21:08.832662 | 0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-10-13 00:12:20.590843 | Other > Add a new company | 700.0 | 1 | 2020-12-31 01:50:17.564992 | 0 | NaN | 2020-12-31 01:50:18.097958 | NaN | NaN | NaN |
| 2 | 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 | 2020-10-13 00:15:26.586054 | NaN | NaN | 0 | 2020-10-13 00:15:27.200942 | 0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 0067c279-3b94-4867-ae51-aa182cfc8dc3 | 2020-10-13 00:17:16.041499 | Other > Add a new company | NaN | 1 | 2020-10-27 15:27:35.952604 | 0 | NaN | NaN | NaN | NaN | NaN |
| 4 | 972e17fd-95b5-4bcc-8f17-c4323abd6c7b | 2020-10-13 00:21:46.310764 | NaN | NaN | 0 | 2020-10-13 00:21:46.398749 | 0 | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2020-10-19 00:03:58.651384 | Vishay | 1000.0 | 1 | 2023-01-16 18:52:38.283396 | 0 | NaN | 2022-11-20 16:59:13.045054 | 2022-11-20 17:02:13.393381 | 2022-11-20 17:02:15.975683 | 2022-11-23 13:49:14.122587 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-10-27 17:36:53.948341 | Anthem, Inc. | 800.0 | 1 | 2023-01-16 19:02:09.110595 | 0 | 2020-11-02 20:37:56.130803 | 2020-11-01 23:23:57.524518 | 2020-11-01 23:25:14.11145 | 2020-11-02 20:39:23.167071 | 2021-09-03 14:12:56.800775 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2020-11-05 13:43:05.890775 | Travelers Insurance | 1000.0 | 1 | 2023-01-15 11:35:48.447004 | 0 | 2021-07-13 23:44:08.466518 | 2022-02-04 16:25:05.34794 | 2022-03-22 22:05:36.403078 | 2022-03-22 22:05:39.51998 | 2022-04-08 13:48:49.29766 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-01 22:49:39.533347 | United States Army (Civilians) | 1000.0 | 1 | 2022-12-19 13:58:21.389337 | 0 | 2020-11-20 23:54:27.624186 | 2020-11-19 19:16:24.802811 | 2020-11-20 23:45:58.574113 | 2020-11-23 14:18:21.049638 | NaN |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2020-10-24 15:36:18.286066 | Indian Health Service | 1000.0 | 1 | 2023-01-15 20:36:10.793236 | 0 | NaN | 2023-01-15 20:15:18.820641 | 2023-01-15 20:17:26.570373 | 2023-01-15 20:17:33.152737 | 2023-01-15 20:28:19.813885 |
95575 rows × 12 columns
user_df.head()
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc | 2020-10-13 00:06:21.603537 | Other > Add a new company | 1000.0 | 1 | 2022-11-22 01:21:08.832662 | 0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-10-13 00:12:20.590843 | Other > Add a new company | 700.0 | 1 | 2020-12-31 01:50:17.564992 | 0 | NaN | 2020-12-31 01:50:18.097958 | NaN | NaN | NaN |
| 2 | 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 | 2020-10-13 00:15:26.586054 | NaN | NaN | 0 | 2020-10-13 00:15:27.200942 | 0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 0067c279-3b94-4867-ae51-aa182cfc8dc3 | 2020-10-13 00:17:16.041499 | Other > Add a new company | NaN | 1 | 2020-10-27 15:27:35.952604 | 0 | NaN | NaN | NaN | NaN | NaN |
| 4 | 972e17fd-95b5-4bcc-8f17-c4323abd6c7b | 2020-10-13 00:21:46.310764 | NaN | NaN | 0 | 2020-10-13 00:21:46.398749 | 0 | NaN | NaN | NaN | NaN | NaN |
user_df
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc | 2020-10-13 00:06:21.603537 | Other > Add a new company | 1000.0 | 1 | 2022-11-22 01:21:08.832662 | 0 | NaN | NaN | NaN | NaN | NaN |
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-10-13 00:12:20.590843 | Other > Add a new company | 700.0 | 1 | 2020-12-31 01:50:17.564992 | 0 | NaN | 2020-12-31 01:50:18.097958 | NaN | NaN | NaN |
| 2 | 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 | 2020-10-13 00:15:26.586054 | NaN | NaN | 0 | 2020-10-13 00:15:27.200942 | 0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 0067c279-3b94-4867-ae51-aa182cfc8dc3 | 2020-10-13 00:17:16.041499 | Other > Add a new company | NaN | 1 | 2020-10-27 15:27:35.952604 | 0 | NaN | NaN | NaN | NaN | NaN |
| 4 | 972e17fd-95b5-4bcc-8f17-c4323abd6c7b | 2020-10-13 00:21:46.310764 | NaN | NaN | 0 | 2020-10-13 00:21:46.398749 | 0 | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2020-10-19 00:03:58.651384 | Vishay | 1000.0 | 1 | 2023-01-16 18:52:38.283396 | 0 | NaN | 2022-11-20 16:59:13.045054 | 2022-11-20 17:02:13.393381 | 2022-11-20 17:02:15.975683 | 2022-11-23 13:49:14.122587 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-10-27 17:36:53.948341 | Anthem, Inc. | 800.0 | 1 | 2023-01-16 19:02:09.110595 | 0 | 2020-11-02 20:37:56.130803 | 2020-11-01 23:23:57.524518 | 2020-11-01 23:25:14.11145 | 2020-11-02 20:39:23.167071 | 2021-09-03 14:12:56.800775 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2020-11-05 13:43:05.890775 | Travelers Insurance | 1000.0 | 1 | 2023-01-15 11:35:48.447004 | 0 | 2021-07-13 23:44:08.466518 | 2022-02-04 16:25:05.34794 | 2022-03-22 22:05:36.403078 | 2022-03-22 22:05:39.51998 | 2022-04-08 13:48:49.29766 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-01 22:49:39.533347 | United States Army (Civilians) | 1000.0 | 1 | 2022-12-19 13:58:21.389337 | 0 | 2020-11-20 23:54:27.624186 | 2020-11-19 19:16:24.802811 | 2020-11-20 23:45:58.574113 | 2020-11-23 14:18:21.049638 | NaN |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2020-10-24 15:36:18.286066 | Indian Health Service | 1000.0 | 1 | 2023-01-15 20:36:10.793236 | 0 | NaN | 2023-01-15 20:15:18.820641 | 2023-01-15 20:17:26.570373 | 2023-01-15 20:17:33.152737 | 2023-01-15 20:28:19.813885 |
95575 rows × 12 columns
#length of the user dataset.
len(user_df)
95575
# user data set column types
user_df.dtypes
user_id object signup_dt object company_name object spending_limit_est float64 valid_phone_ind int64 last_login object was_referred_ind int64 first_paystub_dt object first_application_start_ts object first_application_complete_ts object first_awaiting_payment_ts object first_repayment_ts object dtype: object
# Convert the column to datetimein a loop
datetime_columns = [
'signup_dt',
'last_login',
'first_paystub_dt',
'first_application_start_ts',
'first_application_complete_ts',
'first_awaiting_payment_ts',
'first_repayment_ts'
]
for column in datetime_columns:
user_df[column] = pd.to_datetime(user_df[column])
user_df.dtypes
user_id object signup_dt datetime64[ns] company_name object spending_limit_est float64 valid_phone_ind int64 last_login datetime64[ns] was_referred_ind int64 first_paystub_dt datetime64[ns] first_application_start_ts datetime64[ns] first_application_complete_ts datetime64[ns] first_awaiting_payment_ts datetime64[ns] first_repayment_ts datetime64[ns] dtype: object
# data dimension
user_df.shape
(95575, 12)
# describe statistical information
user_df.describe()
#no user_id, no company name
| signup_dt | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 95575 | 69055.000000 | 95575.000000 | 95575 | 95575.000000 | 8839 | 25105 | 18948 | 14388 | 1524 |
| mean | 2020-10-26 07:30:46.781417984 | 837.165216 | 0.813947 | 2021-03-01 14:29:40.952891136 | 0.035177 | 2021-02-20 00:47:41.967247872 | 2021-01-01 01:58:32.123276800 | 2021-01-09 17:34:31.836468224 | 2021-01-24 12:24:51.778903552 | 2021-04-23 19:11:41.864473856 |
| min | 2020-10-13 00:00:13.593771 | 50.000000 | 0.000000 | 2020-10-13 00:01:32.317156 | 0.000000 | 2016-11-12 00:40:01.316014 | 2016-11-12 00:22:57.724906 | 2016-11-12 00:40:24.715791 | 2016-11-12 18:17:14.283158 | 2016-11-30 21:17:45.514341 |
| 25% | 2020-10-20 06:54:35.580967424 | 650.000000 | 1.000000 | 2020-10-24 22:18:45.041860864 | 0.000000 | 2020-10-26 18:07:49.435676928 | 2020-10-24 03:36:48.439672064 | 2020-10-24 23:20:00.373304832 | 2020-10-26 00:51:18.425989888 | 2020-11-13 15:04:34.914781440 |
| 50% | 2020-10-27 01:36:18.374267904 | 1000.000000 | 1.000000 | 2020-11-02 08:15:36.039156992 | 0.000000 | 2020-11-05 02:37:25.778889984 | 2020-11-01 01:38:02.628940032 | 2020-11-02 02:40:46.823124992 | 2020-11-03 20:59:17.371486464 | 2020-12-04 11:47:32.950610688 |
| 75% | 2020-11-01 05:30:42.165230592 | 1000.000000 | 1.000000 | 2021-01-10 07:02:38.536221952 | 0.000000 | 2021-01-17 04:38:59.484386560 | 2020-11-19 00:52:01.452822016 | 2020-11-30 19:18:59.148196608 | 2020-12-13 18:41:46.900191232 | 2021-09-17 14:33:02.806388992 |
| max | 2020-11-06 23:59:49.199581 | 1000.000000 | 1.000000 | 2023-01-16 19:26:10.352780 | 1.000000 | 2023-01-12 07:15:04.604709 | 2023-01-16 13:38:13.593884 | 2023-01-16 13:39:30.223596 | 2023-01-16 13:39:32.495406 | 2023-01-15 20:28:19.813885 |
| std | NaN | 238.771310 | 0.389151 | NaN | 0.184227 | NaN | NaN | NaN | NaN | NaN |
# data information
user_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 95575 entries, 0 to 95574 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 95575 non-null object 1 signup_dt 95575 non-null datetime64[ns] 2 company_name 86027 non-null object 3 spending_limit_est 69055 non-null float64 4 valid_phone_ind 95575 non-null int64 5 last_login 95575 non-null datetime64[ns] 6 was_referred_ind 95575 non-null int64 7 first_paystub_dt 8839 non-null datetime64[ns] 8 first_application_start_ts 25105 non-null datetime64[ns] 9 first_application_complete_ts 18948 non-null datetime64[ns] 10 first_awaiting_payment_ts 14388 non-null datetime64[ns] 11 first_repayment_ts 1524 non-null datetime64[ns] dtypes: datetime64[ns](7), float64(1), int64(2), object(2) memory usage: 8.8+ MB
null_df = pd.DataFrame({"Null Values": user_df.isnull().sum(),
"Percentage Null Values": (user_df.isnull().sum()) / (user_df.shape[0]) * 100
})
null_df
| Null Values | Percentage Null Values | |
|---|---|---|
| user_id | 0 | 0.000000 |
| signup_dt | 0 | 0.000000 |
| company_name | 9548 | 9.990060 |
| spending_limit_est | 26520 | 27.747842 |
| valid_phone_ind | 0 | 0.000000 |
| last_login | 0 | 0.000000 |
| was_referred_ind | 0 | 0.000000 |
| first_paystub_dt | 86736 | 90.751766 |
| first_application_start_ts | 70470 | 73.732671 |
| first_application_complete_ts | 76627 | 80.174732 |
| first_awaiting_payment_ts | 81187 | 84.945854 |
| first_repayment_ts | 94051 | 98.405441 |
user_df.columns
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
'first_application_start_ts', 'first_application_complete_ts',
'first_awaiting_payment_ts', 'first_repayment_ts'],
dtype='object')
user_msno = user_df[['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
'valid_phone_ind', 'last_login', 'was_referred_ind',
'first_application_start_ts', 'first_application_complete_ts',
'first_awaiting_payment_ts','first_paystub_dt', 'first_repayment_ts']]
msno.bar(user_msno, color="blue")
plt.ylabel('Number of Non-null observation')
plt.show()
user_df.columns
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
'first_application_start_ts', 'first_application_complete_ts',
'first_awaiting_payment_ts', 'first_repayment_ts'],
dtype='object')
user_check_dup = user_df.groupby(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
'first_application_start_ts', 'first_application_complete_ts',
'first_awaiting_payment_ts', 'first_repayment_ts'])['user_id'].count().reset_index(name= 'Distinct Count')
user_check_dup
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | Distinct Count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00319372-d927-4703-be31-fddddece5e2b | 2020-11-05 06:11:15.104940 | Prime Time Healthcare | 1000.0 | 1 | 2022-12-29 03:57:49.027318 | 0 | 2022-01-12 07:19:36.984471 | 2021-10-29 19:08:25.690945 | 2021-10-29 19:11:39.803046 | 2021-10-29 19:11:41.996796 | 2021-10-31 01:12:14.848954 | 1 |
| 1 | 00ddddcc-92ba-4713-b05d-13bfef9a2a05 | 2020-11-05 18:42:21.699998 | Armanino | 1000.0 | 1 | 2021-12-23 16:40:17.025239 | 0 | 2020-11-10 20:44:24.712778 | 2020-11-10 20:39:20.460429 | 2020-11-10 20:40:08.711397 | 2020-11-10 20:40:10.964056 | 2020-11-27 12:33:47.994959 | 1 |
| 2 | 00fdb33d-f087-4d44-91cf-3befd30a3695 | 2020-10-19 15:38:37.476959 | American Legion | 1000.0 | 1 | 2022-06-16 21:31:42.888787 | 0 | 2022-04-01 14:11:21.857435 | 2022-01-01 18:01:19.270900 | 2022-03-28 18:20:48.502681 | 2022-03-28 18:20:54.735573 | 2022-04-01 13:57:10.741083 | 1 |
| 3 | 0111d661-f6b7-40c7-8428-782bc1f6119a | 2020-10-28 02:15:59.849178 | Wabash National | 500.0 | 1 | 2022-09-15 07:52:11.456729 | 0 | 2020-11-17 10:39:10.340941 | 2020-10-28 02:25:12.614750 | 2020-10-28 02:26:26.883021 | 2020-10-28 02:26:28.746381 | 2020-11-17 14:31:26.991396 | 1 |
| 4 | 0146f581-0040-4788-8b59-86d4fd1c1420 | 2020-11-01 00:13:43.426700 | Paychex | 900.0 | 1 | 2023-01-06 17:02:46.054490 | 0 | 2022-04-12 14:38:50.242406 | 2022-04-12 14:31:05.766885 | 2022-04-12 14:34:10.977920 | 2022-04-14 17:46:04.790958 | 2022-05-06 13:36:09.062834 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1005 | fe462bd4-cdac-4ac6-94ae-32ad33400cbb | 2020-10-20 07:49:30.621476 | Other > Add a new company | 1000.0 | 1 | 2023-01-15 19:16:08.722549 | 0 | 2022-12-14 16:46:12.381446 | 2022-12-14 16:40:20.428949 | 2022-12-14 16:41:29.299477 | 2022-12-14 17:08:33.492203 | 2022-12-30 17:54:35.823323 | 1 |
| 1006 | fe49346f-fde7-4b8a-8e58-0eb6a4ff9299 | 2020-11-02 19:20:39.966030 | New York MTA | 1000.0 | 1 | 2021-01-27 05:22:54.315546 | 0 | 2020-11-12 23:37:25.031354 | 2020-11-02 23:30:16.385741 | 2020-11-02 23:31:08.784334 | 2020-11-02 23:31:10.276118 | 2020-12-10 14:41:31.242395 | 1 |
| 1007 | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 2020-10-21 19:13:34.506929 | Trio Home Care | 1000.0 | 1 | 2021-01-21 11:08:41.278935 | 0 | 2020-10-21 19:55:14.276958 | 2020-10-21 19:46:36.655985 | 2020-10-21 19:50:58.185742 | 2020-10-21 19:51:00.430324 | 2020-10-29 13:42:10.992583 | 1 |
| 1008 | ffa28d2d-9d27-4f9c-9485-1dd1ae86606e | 2020-10-26 04:40:22.509667 | McDonald's | 950.0 | 1 | 2022-12-27 22:14:06.405406 | 0 | 2020-10-26 19:42:59.731914 | 2022-10-20 17:28:44.160388 | 2022-10-20 17:29:57.864907 | 2022-10-20 17:30:00.311319 | 2022-11-10 18:05:19.339468 | 1 |
| 1009 | ffe974b2-6323-476a-82ca-97cbaf2c1b39 | 2020-11-02 03:19:22.098953 | Hudwell Delivery | 1000.0 | 1 | 2022-09-19 22:38:27.114806 | 0 | 2020-11-09 04:00:52.447193 | 2020-11-09 03:39:04.402847 | 2020-11-09 03:39:55.104229 | 2020-11-09 03:39:57.320469 | 2020-11-20 15:30:41.737221 | 1 |
1010 rows × 13 columns
user_check_dup[user_check_dup['Distinct Count']>1]
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | Distinct Count |
|---|
user_df.nunique()
user_id 95575 signup_dt 95575 company_name 7874 spending_limit_est 134 valid_phone_ind 2 last_login 95575 was_referred_ind 2 first_paystub_dt 8839 first_application_start_ts 25105 first_application_complete_ts 18948 first_awaiting_payment_ts 14388 first_repayment_ts 1524 dtype: int64
user_df.duplicated().sum()
0
sns.boxplot(x = 'spending_limit_est', data = user_df)
#Get the current axes
ax = plt.gca()
#Set the x-axis limit to start from 0
ax.set_xlim(left = 0)
plt.title('Spending Limit Estimate Boxplot')
Text(0.5, 1.0, 'Spending Limit Estimate Boxplot')
loan_df
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27f95ba4-e56f-4222-bc92-d73bf8d7d669 | beca750a-2041-4c26-8154-6aa85ae9b245 | 471.48 | 8.0 | 0 | NaN | 2020-10-13 00:04:43.64499 | 2020-10-13 00:08:29.744502 | NaN | NaN | 2020-11-03 06:00:09.020005 | automated.verification | T0 |
| 1 | 040d9651-3ef2-46ed-9e28-6ad12bf00585 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 309.98 | 8.0 | 0 | NaN | 2020-10-13 00:24:50.475201 | NaN | NaN | NaN | 2020-10-27 09:00:09.906958 | automated.application_started | T0 |
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 523.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 00:26:48.749278 | 2020-10-13 00:27:53.698868 | 2020-10-13 12:19:55.282708 | NaN | 2021-02-11 06:30:08.28502 | automated.awaiting_payment | T0 |
| 3 | 706b5235-8424-4e91-b813-328dc5603e8e | 1fa64260-49bf-474e-8800-893c0c455a06 | 369.96 | 4.0 | 0 | NaN | 2020-10-13 01:08:06.763585 | NaN | NaN | NaN | 2020-10-27 09:00:08.296637 | automated.application_started | T0 |
| 4 | d850645f-a7fe-4cd1-97db-8f1ac1364a67 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 479.98 | 8.0 | 0 | NaN | 2020-10-13 01:10:40.530583 | NaN | NaN | NaN | 2020-10-27 09:00:08.090965 | automated.application_started | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 887.89 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-24 04:55:58.074156 | 2020-10-24 04:56:44.713389 | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | NaN | NaN | T0 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 880fb13c-70dc-4604-9199-6e158f7c4eda | 847.63 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:55:35.479453 | 2020-10-27 16:55:59.061944 | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | NaN | NaN | T0 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 304.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-30 06:14:34.776961 | 2020-10-30 06:15:24.078435 | 2020-10-30 06:15:25.64894 | 2020-10-30 14:36:51.175908 | NaN | NaN | T0 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 478.96 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 01:13:43.302878 | 2020-11-04 01:14:35.764894 | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | NaN | NaN | T0 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 647.94 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 02:20:08.042178 | 2020-11-06 02:21:09.263684 | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | NaN | NaN | T0 |
39012 rows × 13 columns
string = "This is displayed in your Big Black Console"
type(string)
str
loan_df.dtypes
loan_id object user_id object amount float64 number_of_payments float64 user_pinwheel_eligible_at_ap int64 approval_type object application_start_ts object application_complete_ts object awaiting_payment_ts object repayment_ts object canceled_ts object cancellation_type object risk_tier_at_uw object dtype: object
# Convert the columns to datetime in a loop
datetime_columns = [
'application_start_ts',
'application_complete_ts',
'awaiting_payment_ts',
'repayment_ts',
'repayment_ts',
'canceled_ts'
]
for column in datetime_columns:
loan_df[column] = pd.to_datetime(loan_df[column])
# Convert the columns to string in a loop
datetime_columns = [
'approval_type',
'cancellation_type',
]
for column in datetime_columns:
loan_df[column] = loan_df[column].astype(str)
loan_df.dtypes
loan_id object user_id object amount float64 number_of_payments float64 user_pinwheel_eligible_at_ap int64 approval_type object application_start_ts datetime64[ns] application_complete_ts datetime64[ns] awaiting_payment_ts datetime64[ns] repayment_ts datetime64[ns] canceled_ts datetime64[ns] cancellation_type object risk_tier_at_uw object dtype: object
# data dimension
loan_df.shape #row less than user data set
(39012, 13)
loan_df
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27f95ba4-e56f-4222-bc92-d73bf8d7d669 | beca750a-2041-4c26-8154-6aa85ae9b245 | 471.48 | 8.0 | 0 | nan | 2020-10-13 00:04:43.644990 | 2020-10-13 00:08:29.744502 | NaT | NaT | 2020-11-03 06:00:09.020005 | automated.verification | T0 |
| 1 | 040d9651-3ef2-46ed-9e28-6ad12bf00585 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 309.98 | 8.0 | 0 | nan | 2020-10-13 00:24:50.475201 | NaT | NaT | NaT | 2020-10-27 09:00:09.906958 | automated.application_started | T0 |
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 523.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 00:26:48.749278 | 2020-10-13 00:27:53.698868 | 2020-10-13 12:19:55.282708 | NaT | 2021-02-11 06:30:08.285020 | automated.awaiting_payment | T0 |
| 3 | 706b5235-8424-4e91-b813-328dc5603e8e | 1fa64260-49bf-474e-8800-893c0c455a06 | 369.96 | 4.0 | 0 | nan | 2020-10-13 01:08:06.763585 | NaT | NaT | NaT | 2020-10-27 09:00:08.296637 | automated.application_started | T0 |
| 4 | d850645f-a7fe-4cd1-97db-8f1ac1364a67 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 479.98 | 8.0 | 0 | nan | 2020-10-13 01:10:40.530583 | NaT | NaT | NaT | 2020-10-27 09:00:08.090965 | automated.application_started | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 887.89 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-24 04:55:58.074156 | 2020-10-24 04:56:44.713389 | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | NaT | nan | T0 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 880fb13c-70dc-4604-9199-6e158f7c4eda | 847.63 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:55:35.479453 | 2020-10-27 16:55:59.061944 | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | NaT | nan | T0 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 304.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-30 06:14:34.776961 | 2020-10-30 06:15:24.078435 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 | NaT | nan | T0 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 478.96 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 01:13:43.302878 | 2020-11-04 01:14:35.764894 | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | NaT | nan | T0 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 647.94 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 02:20:08.042178 | 2020-11-06 02:21:09.263684 | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | NaT | nan | T0 |
39012 rows × 13 columns
# describe statistical information
loan_df.describe()
| amount | number_of_payments | user_pinwheel_eligible_at_ap | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | |
|---|---|---|---|---|---|---|---|---|
| count | 39012.000000 | 37066.000000 | 39012.000000 | 39012 | 27164 | 18275 | 1499 | 36295 |
| mean | 519.174718 | 9.621000 | 0.504768 | 2020-10-26 13:19:25.628109568 | 2020-10-26 17:44:53.409922560 | 2020-10-26 20:17:28.368069632 | 2020-11-10 16:48:34.291845632 | 2020-12-11 05:59:21.579855360 |
| min | 7.980000 | 1.000000 | 0.000000 | 2020-10-13 00:01:42.191775 | 2020-10-13 00:07:09.577877 | 2020-10-13 00:26:00.888018 | 2020-10-14 13:52:27.445060 | 2020-10-13 00:57:56.290905 |
| 25% | 284.930000 | 8.000000 | 0.000000 | 2020-10-21 09:47:13.928946688 | 2020-10-21 14:35:26.136439296 | 2020-10-21 16:52:21.867469568 | 2020-10-30 14:40:59.422031872 | 2020-11-01 09:00:28.966358272 |
| 50% | 479.980000 | 8.000000 | 1.000000 | 2020-10-27 13:22:13.275449088 | 2020-10-27 15:01:48.780180992 | 2020-10-27 16:16:25.904480 | 2020-11-06 17:04:05.031853056 | 2020-11-15 06:00:09.628438016 |
| 75% | 709.980000 | 16.000000 | 1.000000 | 2020-10-31 16:30:10.752801280 | 2020-10-31 21:19:59.311596032 | 2020-10-31 21:41:29.274088448 | 2020-11-13 15:05:15.291198976 | 2021-02-14 06:30:43.603794432 |
| max | 2682.980000 | 36.000000 | 1.000000 | 2020-11-06 23:59:45.190724 | 2020-11-19 19:11:21.439778 | 2020-12-04 11:25:52.610301 | 2021-04-02 14:22:56.357633 | 2021-04-08 05:46:53.563057 |
| std | 315.473946 | 4.462309 | 0.499984 | NaN | NaN | NaN | NaN | NaN |
# data information
loan_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 39012 entries, 0 to 39011 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 loan_id 39012 non-null object 1 user_id 39012 non-null object 2 amount 39012 non-null float64 3 number_of_payments 37066 non-null float64 4 user_pinwheel_eligible_at_ap 39012 non-null int64 5 approval_type 39012 non-null object 6 application_start_ts 39012 non-null datetime64[ns] 7 application_complete_ts 27164 non-null datetime64[ns] 8 awaiting_payment_ts 18275 non-null datetime64[ns] 9 repayment_ts 1499 non-null datetime64[ns] 10 canceled_ts 36295 non-null datetime64[ns] 11 cancellation_type 39012 non-null object 12 risk_tier_at_uw 39012 non-null object dtypes: datetime64[ns](5), float64(2), int64(1), object(5) memory usage: 3.9+ MB
loan_null_df = pd.DataFrame({"Null Values": loan_df.isnull().sum(),
"Percentage Null Values": (loan_df.isnull().sum()) / (loan_df.shape[0]) * 100
})
loan_null_df
| Null Values | Percentage Null Values | |
|---|---|---|
| loan_id | 0 | 0.000000 |
| user_id | 0 | 0.000000 |
| amount | 0 | 0.000000 |
| number_of_payments | 1946 | 4.988209 |
| user_pinwheel_eligible_at_ap | 0 | 0.000000 |
| approval_type | 0 | 0.000000 |
| application_start_ts | 0 | 0.000000 |
| application_complete_ts | 11848 | 30.370143 |
| awaiting_payment_ts | 20737 | 53.155439 |
| repayment_ts | 37513 | 96.157593 |
| canceled_ts | 2717 | 6.964524 |
| cancellation_type | 0 | 0.000000 |
| risk_tier_at_uw | 0 | 0.000000 |
loan_df.columns
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
dtype='object')
loan_df['approval_type'].unique()
array(['nan', 'underwriting.auto.ftb_decision',
'underwriting.manual.approve', 'underwriting.manual_review',
'underwriting.manual.verification', 'underwriting.auto.deny',
'underwriting.manual.deny', 'underwriting.manual.override_approve',
'underwriting.auto.approve'], dtype=object)
loan_msno = loan_df[['loan_id', 'user_id', 'amount', 'number_of_payments',
'user_pinwheel_eligible_at_ap', 'application_start_ts',
'application_complete_ts', 'approval_type', 'awaiting_payment_ts' ,'repayment_ts',
'canceled_ts', 'cancellation_type', 'risk_tier_at_uw']]
msno.bar(loan_msno, color="blue")
plt.show()
loan_df.duplicated().sum()
0
loan_df.columns
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
dtype='object')
loan_df
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27f95ba4-e56f-4222-bc92-d73bf8d7d669 | beca750a-2041-4c26-8154-6aa85ae9b245 | 471.48 | 8.0 | 0 | nan | 2020-10-13 00:04:43.644990 | 2020-10-13 00:08:29.744502 | NaT | NaT | 2020-11-03 06:00:09.020005 | automated.verification | T0 |
| 1 | 040d9651-3ef2-46ed-9e28-6ad12bf00585 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 309.98 | 8.0 | 0 | nan | 2020-10-13 00:24:50.475201 | NaT | NaT | NaT | 2020-10-27 09:00:09.906958 | automated.application_started | T0 |
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 523.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 00:26:48.749278 | 2020-10-13 00:27:53.698868 | 2020-10-13 12:19:55.282708 | NaT | 2021-02-11 06:30:08.285020 | automated.awaiting_payment | T0 |
| 3 | 706b5235-8424-4e91-b813-328dc5603e8e | 1fa64260-49bf-474e-8800-893c0c455a06 | 369.96 | 4.0 | 0 | nan | 2020-10-13 01:08:06.763585 | NaT | NaT | NaT | 2020-10-27 09:00:08.296637 | automated.application_started | T0 |
| 4 | d850645f-a7fe-4cd1-97db-8f1ac1364a67 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 479.98 | 8.0 | 0 | nan | 2020-10-13 01:10:40.530583 | NaT | NaT | NaT | 2020-10-27 09:00:08.090965 | automated.application_started | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 887.89 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-24 04:55:58.074156 | 2020-10-24 04:56:44.713389 | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | NaT | nan | T0 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 880fb13c-70dc-4604-9199-6e158f7c4eda | 847.63 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:55:35.479453 | 2020-10-27 16:55:59.061944 | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | NaT | nan | T0 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 304.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-30 06:14:34.776961 | 2020-10-30 06:15:24.078435 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 | NaT | nan | T0 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 478.96 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 01:13:43.302878 | 2020-11-04 01:14:35.764894 | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | NaT | nan | T0 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 647.94 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 02:20:08.042178 | 2020-11-06 02:21:09.263684 | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | NaT | nan | T0 |
39012 rows × 13 columns
loan_df.groupby(['loan_id', 'user_id', 'amount', 'number_of_payments',
'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
])['user_id'].count().reset_index(name= 'Distinct Count')
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | Distinct Count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00063e85-da3b-4e86-80be-fb08e65875c7 | 41659249-af7c-41cd-8dbf-3d0b2cb38a8c | 81.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-23 00:52:17.494655 | 2020-10-23 00:53:47.084478 | 2020-10-23 00:53:48.970713 | 2020-10-30 14:41:03.110715 | 1 |
| 1 | 00100b15-07f9-4717-b456-bee53086dfc2 | 290e2109-dc8b-48ec-bb13-28fa6a97d152 | 559.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-26 16:42:57.134918 | 2020-10-26 16:44:11.731562 | 2020-10-26 16:44:13.787265 | 2020-11-25 14:52:42.767518 | 1 |
| 2 | 0032f35a-6822-4707-a912-56cb0a27bcf4 | b6853856-c6b5-469d-b51b-0287628497fc | 372.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-22 22:57:42.992983 | 2020-10-22 22:58:57.731909 | 2020-10-22 22:58:59.456508 | 2020-10-30 14:40:15.237530 | 1 |
| 3 | 00730e12-04ca-45dd-a8c7-8800cfb76ddc | 13e58d18-208f-433b-b199-27dfdac1acb1 | 839.92 | 4.0 | 0 | underwriting.auto.ftb_decision | 2020-10-26 10:33:23.903355 | 2020-10-26 10:34:55.334010 | 2020-10-26 10:34:57.350641 | 2020-11-02 14:11:54.774641 | 1 |
| 4 | 007b93b8-b52f-46c8-9818-c4cc6e7d7aff | dfb32ba5-6d43-45d9-b5f9-bdde4e42ac52 | 531.96 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 15:17:23.889207 | 2020-11-04 15:17:57.992434 | 2020-11-04 15:17:58.589918 | 2020-11-06 17:04:09.530275 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1494 | ffc96b15-4e32-425c-9a9d-8a44fd3c5a47 | 0ea245be-b7ce-4b61-9881-204b069021dc | 212.98 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-02 00:00:14.242705 | 2020-11-02 00:02:21.840981 | 2020-11-02 00:02:24.306445 | 2020-11-06 16:59:54.164913 | 1 |
| 1495 | ffcd70e3-6d73-4d0b-8d7a-60513e34b0b4 | da2ad2c7-4764-44d3-8fa9-5383fceab818 | 569.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 19:47:36.811747 | 2020-11-06 19:47:45.782024 | 2020-11-06 20:29:35.276487 | 2020-11-20 15:26:49.611627 | 1 |
| 1496 | ffdb7b76-a006-4e76-9c0b-abe87467a9f0 | 1507510f-4043-4913-a1d4-bd263fac3685 | 749.96 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:41:27.857770 | 2020-10-27 16:42:08.599788 | 2020-10-27 16:42:10.179225 | 2020-11-06 16:56:03.549415 | 1 |
| 1497 | ffe68a35-bf91-43a8-bdd7-96a7bec7f258 | b7781673-9c9e-4caa-83c6-39b7e8a237e0 | 686.92 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-14 15:56:15.302675 | 2020-10-14 17:02:40.962480 | 2020-10-14 17:02:42.831729 | 2020-11-06 17:03:00.221545 | 1 |
| 1498 | ffe92b81-c940-4e1f-aacb-64faf27c15df | 93916de2-8dbd-4a77-af9e-a916db9cfc37 | 86.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 01:20:02.370067 | 2020-10-13 01:20:26.633620 | 2020-10-13 01:20:31.748747 | 2020-10-23 13:59:02.709800 | 1 |
1499 rows × 11 columns
sns.histplot(x = 'amount', data = loan_df)
## Calculate the lower and upper quantiles (e.g., 25th and 75th percentiles)
Q1 = np.percentile(loan_df['amount'],25)
Q3 = np.percentile(loan_df['amount'],75)
# Calculate the interquartile range (IQR)
IQR = Q3 - Q1
# Define threshold for identifying outliers (e.g., based on IQR)
lower_threshold = Q1 - 1.5*IQR
upper_threshold = Q3 +1.5*IQR
# Identify and mark outliers
outliers = [x for x in loan_df['amount'] if x < lower_threshold or x > upper_threshold]
#x represents each individual data point in the data list.
#The code checks if x is either less than the lower_threshold or greater than the upper_threshold. If either condition is true, it considers x as an outlier.
#If the condition is true for a particular x, it includes that x in the outliers list.
plt.scatter(outliers, [0] * len(outliers), color='red', marker='o', label='Outliers')
# Optional: Customize plot labels and title
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.title('Distribution of Amount with Outliers')
# Show the plot
plt.legend()
plt.show()
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
user_profile = ProfileReport(user_df, title="User Profiling Report")
user_profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
loan_profile = ProfileReport(loan_df, title="Loan Profiling Report")
loan_profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
user_df[['user_id','signup_dt']].nunique()
user_id 95575 signup_dt 95575 dtype: int64
total_signup = len(user_df.groupby(['user_id', 'signup_dt'])['signup_dt'].count().reset_index(name='mycol'))
total_signup
95575
total_app_start = user_df[['first_application_start_ts']].notna().sum()
total_app_start
first_application_start_ts 25105 dtype: int64
total_app_Fstart = user_df[['first_application_start_ts']].isnull().sum()
total_app_Fstart
first_application_start_ts 70470 dtype: int64
app_init_rate = total_app_start/total_signup *100
print(f"Application initiation rate is {app_init_rate.iloc[0].round(2)}")
Application initiation rate is 26.27
total_app_compl = user_df[['first_application_complete_ts']].notna().sum()
total_app_compl
first_application_complete_ts 18948 dtype: int64
total_app_Fcompl = user_df[['first_application_complete_ts']].isnull().sum()
total_app_Fcompl
first_application_complete_ts 76627 dtype: int64
app_compl_rate = total_app_compl/total_signup *100
print(f"Application Completetion rate is {app_compl_rate.iloc[0].round(2)}")
Application Completetion rate is 19.83
total_approve = user_df[['first_awaiting_payment_ts']].notna().sum()
total_approve
first_awaiting_payment_ts 14388 dtype: int64
total_Fapprove = user_df[['first_awaiting_payment_ts']].isnull().sum()
total_Fapprove
first_awaiting_payment_ts 81187 dtype: int64
approval_rate = total_approve/total_signup *100
print(f"Loan approval rate is {approval_rate.iloc[0].round(2)}")
Loan approval rate is 15.05
total_setup = user_df[['first_paystub_dt']].notna().sum()
total_setup
first_paystub_dt 8839 dtype: int64
total_Fsetup = user_df[['first_paystub_dt']].isnull().sum()
total_Fsetup
first_paystub_dt 86736 dtype: int64
setup_rate = total_setup/total_signup *100
print(f"Payroll direct deposite setup approval rate is {setup_rate.iloc[0].round(2)}")
Payroll direct deposite setup approval rate is 9.25
total_repay = user_df[['first_repayment_ts']].notna().sum()
total_repay
first_repayment_ts 1524 dtype: int64
total_Frepay = user_df[['first_repayment_ts']].isnull().sum()
total_Frepay
first_repayment_ts 94051 dtype: int64
repay_rate = total_repay/total_signup *100
repay_rate.round(2)
print(f"repayment conversion rate is {repay_rate.iloc[0].round(2)}")
repayment conversion rate is 1.59
app_time_df = user_df
#Extract the day from the timestamps
app_time_df['day_difference'] = (user_df['first_application_complete_ts'] - user_df['first_application_start_ts']).dt.days
#Find the time takes to fill loan application
app_time_df['time_difference'] = (user_df['first_application_complete_ts'] - user_df['first_application_start_ts'])
# Average time of filling application within 1 day¶
avg_app_ts = app_time_df[app_time_df['day_difference']<1]['time_difference'].mean()
avg_app_ts
print(f"Average time of filling application within 1 day¶ is {avg_app_ts}")
Average time of filling application within 1 day¶ is 0 days 00:18:15.224973373
refer_df = user_df[['was_referred_ind','first_repayment_ts']].dropna()
refer_df_2 = refer_df.groupby('was_referred_ind')['first_repayment_ts'].count().reset_index(name = 'number_of_refer')
refer_df_2
| was_referred_ind | number_of_refer | |
|---|---|---|
| 0 | 0 | 1426 |
| 1 | 1 | 98 |
referral_rate =(refer_df_2['number_of_refer'].iloc[1]
/(refer_df_2['number_of_refer'].iloc[1] + refer_df_2['number_of_refer'].iloc[0])
*100).round(2)
referral_rate
6.43
refer_df = user_df.groupby('was_referred_ind')['user_id'].count().reset_index(name = 'number_of_refer')
refer_df
| was_referred_ind | number_of_refer | |
|---|---|---|
| 0 | 0 | 92213 |
| 1 | 1 | 3362 |
(refer_df['number_of_refer'].iloc[1]
/(refer_df['number_of_refer'].iloc[1] + refer_df['number_of_refer'].iloc[0])
*100).round(2)
3.52
cancel_df = loan_df
cancel_df.groupby('cancellation_type')['cancellation_type'].count().sort_values(ascending = False)
cancellation_type automated.awaiting_payment 9715 automated.application_started 7996 automated.pending 4455 remorse 3499 nan 2757 qualify_test 2345 accident 2065 other_interests 1649 deposit_failure 1229 didnt_realize_direct_deposit 688 automated.verification 563 purchased_elsewhere 507 spending_limit_exceeded 505 magento.edit_order.cancel 437 duplicate_order 418 1_covid_elevated_risk 143 automated.deactivate_user_request 40 deny 1 Name: cancellation_type, dtype: int64
user_df
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | day_difference | time_difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc | 2020-10-13 00:06:21.603537 | Other > Add a new company | 1000.0 | 1 | 2022-11-22 01:21:08.832662 | 0 | NaT | NaT | NaT | NaT | NaT | NaN | NaT |
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-10-13 00:12:20.590843 | Other > Add a new company | 700.0 | 1 | 2020-12-31 01:50:17.564992 | 0 | NaT | 2020-12-31 01:50:18.097958 | NaT | NaT | NaT | NaN | NaT |
| 2 | 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 | 2020-10-13 00:15:26.586054 | NaN | NaN | 0 | 2020-10-13 00:15:27.200942 | 0 | NaT | NaT | NaT | NaT | NaT | NaN | NaT |
| 3 | 0067c279-3b94-4867-ae51-aa182cfc8dc3 | 2020-10-13 00:17:16.041499 | Other > Add a new company | NaN | 1 | 2020-10-27 15:27:35.952604 | 0 | NaT | NaT | NaT | NaT | NaT | NaN | NaT |
| 4 | 972e17fd-95b5-4bcc-8f17-c4323abd6c7b | 2020-10-13 00:21:46.310764 | NaN | NaN | 0 | 2020-10-13 00:21:46.398749 | 0 | NaT | NaT | NaT | NaT | NaT | NaN | NaT |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2020-10-19 00:03:58.651384 | Vishay | 1000.0 | 1 | 2023-01-16 18:52:38.283396 | 0 | NaT | 2022-11-20 16:59:13.045054 | 2022-11-20 17:02:13.393381 | 2022-11-20 17:02:15.975683 | 2022-11-23 13:49:14.122587 | 0.0 | 0 days 00:03:00.348327 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-10-27 17:36:53.948341 | Anthem, Inc. | 800.0 | 1 | 2023-01-16 19:02:09.110595 | 0 | 2020-11-02 20:37:56.130803 | 2020-11-01 23:23:57.524518 | 2020-11-01 23:25:14.111450 | 2020-11-02 20:39:23.167071 | 2021-09-03 14:12:56.800775 | 0.0 | 0 days 00:01:16.586932 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2020-11-05 13:43:05.890775 | Travelers Insurance | 1000.0 | 1 | 2023-01-15 11:35:48.447004 | 0 | 2021-07-13 23:44:08.466518 | 2022-02-04 16:25:05.347940 | 2022-03-22 22:05:36.403078 | 2022-03-22 22:05:39.519980 | 2022-04-08 13:48:49.297660 | 46.0 | 46 days 05:40:31.055138 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-01 22:49:39.533347 | United States Army (Civilians) | 1000.0 | 1 | 2022-12-19 13:58:21.389337 | 0 | 2020-11-20 23:54:27.624186 | 2020-11-19 19:16:24.802811 | 2020-11-20 23:45:58.574113 | 2020-11-23 14:18:21.049638 | NaT | 1.0 | 1 days 04:29:33.771302 |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2020-10-24 15:36:18.286066 | Indian Health Service | 1000.0 | 1 | 2023-01-15 20:36:10.793236 | 0 | NaT | 2023-01-15 20:15:18.820641 | 2023-01-15 20:17:26.570373 | 2023-01-15 20:17:33.152737 | 2023-01-15 20:28:19.813885 | 0.0 | 0 days 00:02:07.749732 |
95575 rows × 14 columns
#unique user_id in user dataset
len(user_df['user_id'].unique())
95575
loan_df
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27f95ba4-e56f-4222-bc92-d73bf8d7d669 | beca750a-2041-4c26-8154-6aa85ae9b245 | 471.48 | 8.0 | 0 | nan | 2020-10-13 00:04:43.644990 | 2020-10-13 00:08:29.744502 | NaT | NaT | 2020-11-03 06:00:09.020005 | automated.verification | T0 |
| 1 | 040d9651-3ef2-46ed-9e28-6ad12bf00585 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 309.98 | 8.0 | 0 | nan | 2020-10-13 00:24:50.475201 | NaT | NaT | NaT | 2020-10-27 09:00:09.906958 | automated.application_started | T0 |
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 523.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 00:26:48.749278 | 2020-10-13 00:27:53.698868 | 2020-10-13 12:19:55.282708 | NaT | 2021-02-11 06:30:08.285020 | automated.awaiting_payment | T0 |
| 3 | 706b5235-8424-4e91-b813-328dc5603e8e | 1fa64260-49bf-474e-8800-893c0c455a06 | 369.96 | 4.0 | 0 | nan | 2020-10-13 01:08:06.763585 | NaT | NaT | NaT | 2020-10-27 09:00:08.296637 | automated.application_started | T0 |
| 4 | d850645f-a7fe-4cd1-97db-8f1ac1364a67 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 479.98 | 8.0 | 0 | nan | 2020-10-13 01:10:40.530583 | NaT | NaT | NaT | 2020-10-27 09:00:08.090965 | automated.application_started | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 887.89 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-24 04:55:58.074156 | 2020-10-24 04:56:44.713389 | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | NaT | nan | T0 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 880fb13c-70dc-4604-9199-6e158f7c4eda | 847.63 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:55:35.479453 | 2020-10-27 16:55:59.061944 | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | NaT | nan | T0 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 304.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-30 06:14:34.776961 | 2020-10-30 06:15:24.078435 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 | NaT | nan | T0 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 478.96 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 01:13:43.302878 | 2020-11-04 01:14:35.764894 | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | NaT | nan | T0 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 647.94 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 02:20:08.042178 | 2020-11-06 02:21:09.263684 | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | NaT | nan | T0 |
39012 rows × 13 columns
#unique loan_id in loan dataset
len(loan_df['user_id'].unique())
31656
user_appstart = user_df[['user_id','first_application_start_ts']]
user_appstart = user_appstart.dropna()
user_appstart
| user_id | first_application_start_ts | |
|---|---|---|
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-12-31 01:50:18.097958 |
| 5 | f25d9c31-0a14-434a-adc7-f812685ba9c2 | 2020-10-13 00:30:38.930504 |
| 8 | c1651717-68e9-40da-88b6-34ac52261114 | 2020-10-22 16:21:01.047124 |
| 15 | f7424169-db78-4451-89a4-b74ebbbc7b03 | 2020-10-13 02:17:57.782537 |
| 16 | cb022b83-df8c-4173-bfd0-f6a44922b370 | 2020-10-13 01:49:45.691136 |
| ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2022-11-20 16:59:13.045054 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-11-01 23:23:57.524518 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2022-02-04 16:25:05.347940 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-19 19:16:24.802811 |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2023-01-15 20:15:18.820641 |
25105 rows × 2 columns
loan_df[['user_id','application_start_ts']]
| user_id | application_start_ts | |
|---|---|---|
| 0 | beca750a-2041-4c26-8154-6aa85ae9b245 | 2020-10-13 00:04:43.644990 |
| 1 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 2020-10-13 00:24:50.475201 |
| 2 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 2020-10-13 00:26:48.749278 |
| 3 | 1fa64260-49bf-474e-8800-893c0c455a06 | 2020-10-13 01:08:06.763585 |
| 4 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 2020-10-13 01:10:40.530583 |
| ... | ... | ... |
| 39007 | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 2020-10-24 04:55:58.074156 |
| 39008 | 880fb13c-70dc-4604-9199-6e158f7c4eda | 2020-10-27 16:55:35.479453 |
| 39009 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 2020-10-30 06:14:34.776961 |
| 39010 | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 2020-11-04 01:13:43.302878 |
| 39011 | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 2020-11-06 02:20:08.042178 |
39012 rows × 2 columns
# Perform an inner join in pandas
merged_inner = pd.merge(user_appstart, loan_df, on='user_id', how='inner')
merged_inner
| user_id | first_application_start_ts | loan_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | f25d9c31-0a14-434a-adc7-f812685ba9c2 | 2020-10-13 00:30:38.930504 | 1e1321c0-c3ac-413c-93eb-829753e18357 | 485.94 | 8.0 | 0 | nan | 2020-10-13 00:30:38.930504 | NaT | NaT | NaT | 2020-10-27 09:00:09.552342 | automated.application_started | T0 |
| 1 | c1651717-68e9-40da-88b6-34ac52261114 | 2020-10-22 16:21:01.047124 | cb434a06-ffe1-4f13-a776-f320c1f32d65 | 100.98 | NaN | 0 | nan | 2020-10-22 16:21:01.047124 | NaT | NaT | NaT | 2020-11-06 10:00:56.076354 | automated.application_started | T0 |
| 2 | f7424169-db78-4451-89a4-b74ebbbc7b03 | 2020-10-13 02:17:57.782537 | fdd339c4-3702-41e8-91fe-1aad6e17bd57 | 528.92 | 8.0 | 0 | nan | 2020-10-13 02:17:57.782537 | NaT | NaT | NaT | 2020-10-27 09:00:16.373991 | automated.application_started | T0 |
| 3 | cb022b83-df8c-4173-bfd0-f6a44922b370 | 2020-10-13 01:49:45.691136 | c952956e-144c-40fe-b5d6-bc84e4dbe37a | 262.98 | 4.0 | 0 | nan | 2020-10-13 01:49:45.691136 | 2020-10-19 18:20:01.904799 | NaT | NaT | 2020-10-19 18:24:04.161829 | remorse | T0 |
| 4 | 957bd9b4-a70b-4eba-94b0-b75fd39e0635 | 2020-10-13 03:22:20.637457 | 6c4945bf-ed63-41ff-8fb8-641cb53310f3 | 953.19 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 03:22:20.637457 | 2020-10-13 03:25:10.487610 | 2020-10-13 11:47:52.548097 | NaT | 2021-02-11 06:30:09.929995 | automated.awaiting_payment | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20873 | 4d94cbb8-5cba-47b2-a8fa-bba98badadca | 2020-10-28 17:37:04.274041 | ffd85f51-9e03-4202-ab71-ee06a5438c6e | 569.98 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-30 22:32:19.815574 | 2020-10-30 22:32:54.834815 | 2020-10-30 22:32:55.437703 | NaT | 2021-02-28 06:31:23.386452 | automated.awaiting_payment | T0 |
| 20874 | b411e962-2255-467d-b97f-65ea0d8f771d | 2020-10-24 04:44:31.258199 | a6ce21f0-bd8c-4175-b514-9f6b3173569a | 382.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-24 04:49:58.575089 | 2020-10-24 04:51:16.920086 | 2020-10-24 04:51:19.087176 | 2020-11-12 14:33:42.146713 | NaT | nan | T0 |
| 20875 | b411e962-2255-467d-b97f-65ea0d8f771d | 2020-10-24 04:44:31.258199 | f9eb9cb6-79e9-4d61-bed9-a6fa1a4813d5 | 839.98 | 8.0 | 1 | nan | 2020-10-24 04:44:31.258199 | NaT | NaT | NaT | 2020-10-24 04:46:54.132675 | remorse | T0 |
| 20876 | e691a8bc-e598-4077-ada7-8da2c7bd6dca | 2020-10-23 23:30:21.399193 | e2b80d0e-3536-46b5-92c1-927b59f4ccbd | 678.52 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-23 23:30:21.399193 | 2020-10-23 23:31:05.948503 | 2020-10-23 23:31:07.998526 | NaT | 2021-02-21 06:31:19.630065 | automated.awaiting_payment | T0 |
| 20877 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-11-01 23:23:57.524518 | 3f4fc6a0-566c-4fca-8d7b-614b663f75f8 | 509.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-11-01 23:23:57.524518 | 2020-11-01 23:25:14.111450 | 2020-11-02 20:39:23.167071 | NaT | 2020-12-04 20:32:11.484867 | other_interests | T0 |
20878 rows × 14 columns
# Check if the number of unique users in the merged dataframe is equal to that in the loan_df
len(merged_inner['user_id'].unique()) == len(loan_df['user_id'].unique())
False
len(merged_inner['user_id'].unique())
17042
len(loan_df['user_id'].unique())
31656
# describe statistical information
user_df.describe()
#no user_id, no company name
| signup_dt | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | day_difference | time_difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 95575 | 69055.000000 | 95575.000000 | 95575 | 95575.000000 | 8839 | 25105 | 18948 | 14388 | 1524 | 18948.000000 | 18948 |
| mean | 2020-10-26 07:30:46.781417984 | 837.165216 | 0.813947 | 2021-03-01 14:29:40.952891136 | 0.035177 | 2021-02-20 00:47:41.967247872 | 2021-01-01 01:58:32.123276800 | 2021-01-09 17:34:31.836468224 | 2021-01-24 12:24:51.778903552 | 2021-04-23 19:11:41.864473856 | 11.244459 | 11 days 07:05:48.874714700 |
| min | 2020-10-13 00:00:13.593771 | 50.000000 | 0.000000 | 2020-10-13 00:01:32.317156 | 0.000000 | 2016-11-12 00:40:01.316014 | 2016-11-12 00:22:57.724906 | 2016-11-12 00:40:24.715791 | 2016-11-12 18:17:14.283158 | 2016-11-30 21:17:45.514341 | 0.000000 | 0 days 00:00:12.098707 |
| 25% | 2020-10-20 06:54:35.580967424 | 650.000000 | 1.000000 | 2020-10-24 22:18:45.041860864 | 0.000000 | 2020-10-26 18:07:49.435676928 | 2020-10-24 03:36:48.439672064 | 2020-10-24 23:20:00.373304832 | 2020-10-26 00:51:18.425989888 | 2020-11-13 15:04:34.914781440 | 0.000000 | 0 days 00:00:53.940616750 |
| 50% | 2020-10-27 01:36:18.374267904 | 1000.000000 | 1.000000 | 2020-11-02 08:15:36.039156992 | 0.000000 | 2020-11-05 02:37:25.778889984 | 2020-11-01 01:38:02.628940032 | 2020-11-02 02:40:46.823124992 | 2020-11-03 20:59:17.371486464 | 2020-12-04 11:47:32.950610688 | 0.000000 | 0 days 00:01:15.089982 |
| 75% | 2020-11-01 05:30:42.165230592 | 1000.000000 | 1.000000 | 2021-01-10 07:02:38.536221952 | 0.000000 | 2021-01-17 04:38:59.484386560 | 2020-11-19 00:52:01.452822016 | 2020-11-30 19:18:59.148196608 | 2020-12-13 18:41:46.900191232 | 2021-09-17 14:33:02.806388992 | 0.000000 | 0 days 00:02:19.709684 |
| max | 2020-11-06 23:59:49.199581 | 1000.000000 | 1.000000 | 2023-01-16 19:26:10.352780 | 1.000000 | 2023-01-12 07:15:04.604709 | 2023-01-16 13:38:13.593884 | 2023-01-16 13:39:30.223596 | 2023-01-16 13:39:32.495406 | 2023-01-15 20:28:19.813885 | 1246.000000 | 1246 days 03:43:44.088223 |
| std | NaN | 238.771310 | 0.389151 | NaN | 0.184227 | NaN | NaN | NaN | NaN | NaN | 74.546476 | 74 days 14:45:03.149283250 |
# Create a boolean mask for the constraint
constraint_mask = (
(user_df['signup_dt'] < user_df['first_application_start_ts']) &
(user_df['first_application_start_ts'] < user_df['first_application_complete_ts']) &
(user_df['first_application_complete_ts'] < user_df['first_awaiting_payment_ts']) &
(user_df['first_awaiting_payment_ts'] < user_df['first_repayment_ts']) &
(user_df['spending_limit_est'] > 0) # Assuming that spending limit should be greater than 0
)
# Apply the mask to filter rows that satisfy the constraint
filtered_user_df = user_df[constraint_mask]
filtered_user_df.shape
(1496, 14)
user_df[user_df['first_paystub_dt'] == '2016-11-12 00:40:01.316014']
| user_id | signup_dt | company_name | spending_limit_est | valid_phone_ind | last_login | was_referred_ind | first_paystub_dt | first_application_start_ts | first_application_complete_ts | first_awaiting_payment_ts | first_repayment_ts | day_difference | time_difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 236 | bc5b5d91-1118-47fc-895e-91cc709e7e5a | 2020-10-14 00:16:44.491340 | United States Postal Service | 1000.0 | 1 | 2022-10-26 00:43:13.947191 | 1 | 2016-11-12 00:40:01.316014 | 2016-11-12 00:22:57.724906 | 2016-11-12 00:40:24.715791 | 2016-11-12 18:17:14.283158 | 2016-11-30 21:17:45.514341 | 0.0 | 0 days 00:17:26.990885 |
❖ Start a loan application by checking out on an item(s)
❖ Complete an application by providing some identifying information
❖ Approval/Denial of the loan application
❖ Set up a payroll direct deposit based on the agreed terms of the approved loan application
❖ Loan enters repayment after the first direct deposit payment has been made and the item(s) will be shipped
user_df.columns
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
'first_application_start_ts', 'first_application_complete_ts',
'first_awaiting_payment_ts', 'first_repayment_ts', 'day_difference',
'time_difference'],
dtype='object')
A_repay = user_df[['user_id','first_awaiting_payment_ts','first_repayment_ts']]
A_repay = A_repay[A_repay['first_awaiting_payment_ts'].notnull()]
A_repay
| user_id | first_awaiting_payment_ts | first_repayment_ts | |
|---|---|---|---|
| 22 | 957bd9b4-a70b-4eba-94b0-b75fd39e0635 | 2020-10-13 11:47:52.548097 | NaT |
| 39 | 020d9f88-5cae-434a-83b6-d268b1c51276 | 2020-10-15 15:22:54.337590 | NaT |
| 46 | da60418c-b6a8-4d6b-ba74-e7015788e099 | 2021-01-27 00:03:31.446748 | NaT |
| 53 | d7335de9-e9e9-4eba-9909-bf7dbaff54e5 | 2020-10-15 06:23:24.747991 | 2020-10-23 13:59:24.148156 |
| 59 | b90038d6-cf39-4fc5-b5a9-cefe597e3315 | 2022-02-26 06:02:31.446417 | NaT |
| ... | ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2022-11-20 17:02:15.975683 | 2022-11-23 13:49:14.122587 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-11-02 20:39:23.167071 | 2021-09-03 14:12:56.800775 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2022-03-22 22:05:39.519980 | 2022-04-08 13:48:49.297660 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-23 14:18:21.049638 | NaT |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2023-01-15 20:17:33.152737 | 2023-01-15 20:28:19.813885 |
14388 rows × 3 columns
A_num_loan = A_repay[['first_repayment_ts']].notnull().sum().iloc[0]
print(f"There are {A_num_loan} loans enter repayment of approvals.")
There are 1524 loans enter repayment of approvals.
A_repayN = A_repay.dropna()
# Create an independent copy of the DataFrame slice
#By using A_repayN.copy(), you create a new DataFrame that is independent of the original one.
#Any modifications you make to this copy won't trigger the SettingWithCopyWarning,
#as it's clear to Pandas that A_repayN is a standalone DataFrame.
A_repayN = A_repayN.copy()
# Now perform your operation
A_repayN['timediff'] = A_repayN['first_repayment_ts'] - A_repayN['first_awaiting_payment_ts']
A_repayN['daysdiff'] =A_repayN['timediff'].dt.days
A_repay_result = A_repayN[A_repayN['daysdiff']<=15].reset_index()
A_repay_result
| index | user_id | first_awaiting_payment_ts | first_repayment_ts | timediff | daysdiff | |
|---|---|---|---|---|---|---|
| 0 | 53 | d7335de9-e9e9-4eba-9909-bf7dbaff54e5 | 2020-10-15 06:23:24.747991 | 2020-10-23 13:59:24.148156 | 8 days 07:35:59.400165 | 8 |
| 1 | 334 | 50fdac43-85ba-4bae-9ed2-1de7180e367e | 2020-11-16 11:25:29.719128 | 2020-11-25 14:52:38.530067 | 9 days 03:27:08.810939 | 9 |
| 2 | 370 | 298c7fd1-aa16-49b8-a9bc-f9d270a5f6b1 | 2020-10-14 15:23:23.809563 | 2020-10-22 13:35:26.253276 | 7 days 22:12:02.443713 | 7 |
| 3 | 753 | f9132e9e-15c1-48b4-aa12-c30a1104bb82 | 2020-12-19 21:24:39.993132 | 2020-12-24 15:10:36.806978 | 4 days 17:45:56.813846 | 4 |
| 4 | 840 | 13e58d18-208f-433b-b199-27dfdac1acb1 | 2020-10-26 10:34:57.350641 | 2020-11-02 14:11:54.774641 | 7 days 03:36:57.424000 | 7 |
| ... | ... | ... | ... | ... | ... | ... |
| 809 | 95558 | 58c5d22c-f5ff-4af9-99b7-10b32e6f2c34 | 2021-09-30 07:35:04.399192 | 2021-10-02 17:05:03.733286 | 2 days 09:29:59.334094 | 2 |
| 810 | 95561 | 2b5ceb0a-8a9f-46c5-9745-87ab0ec794e9 | 2022-11-27 01:33:17.896867 | 2022-12-07 13:06:32.459778 | 10 days 11:33:14.562911 | 10 |
| 811 | 95568 | 8a686a7a-cb47-489f-99e6-50bfd4755601 | 2022-09-28 14:01:48.565190 | 2022-10-14 13:57:22.435704 | 15 days 23:55:33.870514 | 15 |
| 812 | 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2022-11-20 17:02:15.975683 | 2022-11-23 13:49:14.122587 | 2 days 20:46:58.146904 | 2 |
| 813 | 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2023-01-15 20:17:33.152737 | 2023-01-15 20:28:19.813885 | 0 days 00:10:46.661148 | 0 |
814 rows × 6 columns
print(f"There are {len(A_repay_result)} loans enter repayment within 15 days of approvals.")
There are 814 loans enter repayment within 15 days of approvals.
loan_df.columns
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
dtype='object')
loan_df['approval_type'].unique()
array(['nan', 'underwriting.auto.ftb_decision',
'underwriting.manual.approve', 'underwriting.manual_review',
'underwriting.manual.verification', 'underwriting.auto.deny',
'underwriting.manual.deny', 'underwriting.manual.override_approve',
'underwriting.auto.approve'], dtype=object)
loan_df['awaiting_payment_ts'].notnull().sum()
18275
loan_df['approval_type'].notnull().sum()
39012
B_repay = loan_df[['loan_id','awaiting_payment_ts','repayment_ts']]
B_repay = B_repay[B_repay['awaiting_payment_ts'].notnull()]
B_repay
| loan_id | awaiting_payment_ts | repayment_ts | |
|---|---|---|---|
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | 2020-10-13 12:19:55.282708 | NaT |
| 8 | e8998cbb-37ab-4180-93e9-d22b7855665f | 2020-10-13 02:41:12.924726 | NaT |
| 9 | 0004ff34-d655-4f25-9104-6e108f454a13 | 2020-10-13 02:59:35.127466 | NaT |
| 11 | 6c4945bf-ed63-41ff-8fb8-641cb53310f3 | 2020-10-13 11:47:52.548097 | NaT |
| 20 | 471cb918-697c-4cb9-8846-fb075000b6d0 | 2020-10-13 06:01:33.888217 | NaT |
| ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 |
18275 rows × 3 columns
B_num_loan = B_repay[['repayment_ts']].notnull().sum().iloc[0]
print(f"There are {B_num_loan} loans enter repayment of approvals.")
There are 1499 loans enter repayment of approvals.
B_repayN = B_repay.dropna()
# Create an independent copy of the DataFrame slice
#By using A_repayN.copy(), you create a new DataFrame that is independent of the original one.
#Any modifications you make to this copy won't trigger the SettingWithCopyWarning,
#as it's clear to Pandas that A_repayN is a standalone DataFrame.
B_repayN = B_repayN.copy()
# Now perform your operation
B_repayN['timediff'] = B_repayN['repayment_ts'] - B_repayN['awaiting_payment_ts']
B_repayN['daysdiff'] = B_repayN['timediff'].dt.days
B_repay_result = B_repayN[B_repayN['daysdiff']<=15].reset_index()
B_repay_result
| index | loan_id | awaiting_payment_ts | repayment_ts | timediff | daysdiff | |
|---|---|---|---|---|---|---|
| 0 | 39 | 872a3fce-4e3a-4120-9803-d2ebbe3a24de | 2020-10-13 17:03:37.755547 | 2020-10-16 14:30:20.587989 | 2 days 21:26:42.832442 | 2 |
| 1 | 41 | 1fe4e816-a795-4117-b7b6-dea4b0770efb | 2020-10-13 17:07:39.660251 | 2020-10-23 13:57:59.591775 | 9 days 20:50:19.931524 | 9 |
| 2 | 58 | 1ce92996-7c10-4d57-a7bf-3cc7fff3d0fa | 2020-10-13 19:09:41.350623 | 2020-10-23 13:59:15.645653 | 9 days 18:49:34.295030 | 9 |
| 3 | 182 | b20fcf6c-7e69-4e3e-9f1a-5abc8bae5bbd | 2020-10-16 03:25:53.399806 | 2020-10-27 13:25:50.585331 | 11 days 09:59:57.185525 | 11 |
| 4 | 237 | b17c56d0-3be3-4f07-8eaf-a6c74f47a9c8 | 2020-10-15 18:47:17.028023 | 2020-10-23 13:59:28.740101 | 7 days 19:12:11.712078 | 7 |
| ... | ... | ... | ... | ... | ... | ... |
| 1045 | 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | 5 days 08:45:25.284949 | 5 |
| 1046 | 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | 10 days 00:07:16.200077 | 10 |
| 1047 | 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 | 0 days 08:21:25.526968 | 0 |
| 1048 | 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | 9 days 13:50:38.545797 | 9 |
| 1049 | 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | 14 days 13:04:26.296283 | 14 |
1050 rows × 6 columns
print(f"There are {len(B_repay_result)} loans enter repayment within 15 days of approvals.")
There are 1050 loans enter repayment within 15 days of approvals.
print(f"There are {A_num_loan} loans enter repayment of approvals.")
print(f"There are {len(A_repay_result)} loans enter repayment within 15 days of approvals.")
print(f"There are {B_num_loan} loans enter repayment of approvals.")
print(f"There are {len(B_repay_result)} loans enter repayment within 15 days of approvals.")
There are 1524 loans enter repayment of approvals. There are 814 loans enter repayment within 15 days of approvals. There are 1499 loans enter repayment of approvals. There are 1050 loans enter repayment within 15 days of approvals.
cancel_df = loan_df
cancel_df.groupby('cancellation_type')['cancellation_type'].count().sort_values(ascending = False).reset_index(name= 'count')
| cancellation_type | count | |
|---|---|---|
| 0 | automated.awaiting_payment | 9715 |
| 1 | automated.application_started | 7996 |
| 2 | automated.pending | 4455 |
| 3 | remorse | 3499 |
| 4 | nan | 2757 |
| 5 | qualify_test | 2345 |
| 6 | accident | 2065 |
| 7 | other_interests | 1649 |
| 8 | deposit_failure | 1229 |
| 9 | didnt_realize_direct_deposit | 688 |
| 10 | automated.verification | 563 |
| 11 | purchased_elsewhere | 507 |
| 12 | spending_limit_exceeded | 505 |
| 13 | magento.edit_order.cancel | 437 |
| 14 | duplicate_order | 418 |
| 15 | 1_covid_elevated_risk | 143 |
| 16 | automated.deactivate_user_request | 40 |
| 17 | deny | 1 |
loan_df
| loan_id | user_id | amount | number_of_payments | user_pinwheel_eligible_at_ap | approval_type | application_start_ts | application_complete_ts | awaiting_payment_ts | repayment_ts | canceled_ts | cancellation_type | risk_tier_at_uw | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 27f95ba4-e56f-4222-bc92-d73bf8d7d669 | beca750a-2041-4c26-8154-6aa85ae9b245 | 471.48 | 8.0 | 0 | nan | 2020-10-13 00:04:43.644990 | 2020-10-13 00:08:29.744502 | NaT | NaT | 2020-11-03 06:00:09.020005 | automated.verification | T0 |
| 1 | 040d9651-3ef2-46ed-9e28-6ad12bf00585 | 3401d86c-898b-4d6f-ba67-08b2bc005d5f | 309.98 | 8.0 | 0 | nan | 2020-10-13 00:24:50.475201 | NaT | NaT | NaT | 2020-10-27 09:00:09.906958 | automated.application_started | T0 |
| 2 | 71a279d5-5bce-4ea2-9e98-ec9d994256b1 | ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f | 523.98 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-10-13 00:26:48.749278 | 2020-10-13 00:27:53.698868 | 2020-10-13 12:19:55.282708 | NaT | 2021-02-11 06:30:08.285020 | automated.awaiting_payment | T0 |
| 3 | 706b5235-8424-4e91-b813-328dc5603e8e | 1fa64260-49bf-474e-8800-893c0c455a06 | 369.96 | 4.0 | 0 | nan | 2020-10-13 01:08:06.763585 | NaT | NaT | NaT | 2020-10-27 09:00:08.296637 | automated.application_started | T0 |
| 4 | d850645f-a7fe-4cd1-97db-8f1ac1364a67 | 90f815e9-c931-4d9e-aa79-f136bc5f227c | 479.98 | 8.0 | 0 | nan | 2020-10-13 01:10:40.530583 | NaT | NaT | NaT | 2020-10-27 09:00:08.090965 | automated.application_started | T0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39007 | c538afec-eabb-442c-b53f-0d8163ba679c | ff61dff9-88e4-4a85-85b9-2a912f156dc9 | 887.89 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-24 04:55:58.074156 | 2020-10-24 04:56:44.713389 | 2020-10-24 04:56:45.707634 | 2020-10-29 13:42:10.992583 | NaT | nan | T0 |
| 39008 | 8b6b254d-4a62-4579-b640-6a17d61cc76e | 880fb13c-70dc-4604-9199-6e158f7c4eda | 847.63 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-10-27 16:55:35.479453 | 2020-10-27 16:55:59.061944 | 2020-10-27 16:55:59.529787 | 2020-11-06 17:03:15.729864 | NaT | nan | T0 |
| 39009 | 5e1b86e3-1f17-4413-9551-5207de5b29c5 | fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 | 304.98 | 8.0 | 0 | underwriting.auto.ftb_decision | 2020-10-30 06:14:34.776961 | 2020-10-30 06:15:24.078435 | 2020-10-30 06:15:25.648940 | 2020-10-30 14:36:51.175908 | NaT | nan | T0 |
| 39010 | 601ee268-b2da-41ea-9470-0c2574e7410c | 54612ae8-64d4-426e-8c22-57d2791a0b86 | 478.96 | 16.0 | 1 | underwriting.auto.ftb_decision | 2020-11-04 01:13:43.302878 | 2020-11-04 01:14:35.764894 | 2020-11-04 01:14:37.253225 | 2020-11-13 15:05:15.799022 | NaT | nan | T0 |
| 39011 | da8f8586-6ac5-40c5-bdda-8e570151f35f | 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 | 647.94 | 8.0 | 1 | underwriting.auto.ftb_decision | 2020-11-06 02:20:08.042178 | 2020-11-06 02:21:09.263684 | 2020-11-06 02:21:11.054734 | 2020-11-20 15:25:37.351017 | NaT | nan | T0 |
39012 rows × 13 columns
approve_df = loan_df
approve_df.groupby('approval_type')['approval_type'].count().sort_values(ascending = False).reset_index(name= 'count')
| approval_type | count | |
|---|---|---|
| 0 | underwriting.auto.ftb_decision | 17761 |
| 1 | nan | 15707 |
| 2 | underwriting.manual_review | 2740 |
| 3 | underwriting.manual.verification | 1106 |
| 4 | underwriting.auto.deny | 920 |
| 5 | underwriting.manual.approve | 662 |
| 6 | underwriting.manual.deny | 109 |
| 7 | underwriting.manual.override_approve | 4 |
| 8 | underwriting.auto.approve | 3 |
other = user_df[['user_id','signup_dt']]
other = other.copy()
# Assuming 'other' is your DataFrame and 'signup_dt' is your datetime column
other['year'] = other['signup_dt'].dt.year
other['month'] = other['signup_dt'].dt.month
other['day'] = other['signup_dt'].dt.day
other['date'] = pd.to_datetime(other[['year', 'month', 'day']])
other_n = other.groupby(['date'])['signup_dt'].count().reset_index(name = 'frequency')
other_n
| date | frequency | |
|---|---|---|
| 0 | 2020-10-13 | 2674 |
| 1 | 2020-10-14 | 3273 |
| 2 | 2020-10-15 | 3353 |
| 3 | 2020-10-16 | 2999 |
| 4 | 2020-10-17 | 3538 |
| 5 | 2020-10-18 | 3401 |
| 6 | 2020-10-19 | 3497 |
| 7 | 2020-10-20 | 3270 |
| 8 | 2020-10-21 | 3410 |
| 9 | 2020-10-22 | 3644 |
| 10 | 2020-10-23 | 3366 |
| 11 | 2020-10-24 | 3447 |
| 12 | 2020-10-25 | 3663 |
| 13 | 2020-10-26 | 3853 |
| 14 | 2020-10-27 | 5800 |
| 15 | 2020-10-28 | 5370 |
| 16 | 2020-10-29 | 4386 |
| 17 | 2020-10-30 | 3951 |
| 18 | 2020-10-31 | 3747 |
| 19 | 2020-11-01 | 4070 |
| 20 | 2020-11-02 | 4037 |
| 21 | 2020-11-03 | 4434 |
| 22 | 2020-11-04 | 3995 |
| 23 | 2020-11-05 | 4162 |
| 24 | 2020-11-06 | 4235 |
# Create a line plot
sns.lineplot(data=other_n, x='date', y='frequency')
# Finding the highest two points
top_two = other_n.nlargest(2, 'frequency')
# Annotating the highest two points with date and frequency
for _, row in top_two.iterrows():
label = f"{row['date'].strftime('%Y-%m-%d')}, {row['frequency']}"
plt.annotate(label, (row['date'], row['frequency']), textcoords="offset points", xytext=(50,1), ha='center')
# Highlighting the highest two points with dots
plt.scatter(top_two['date'], top_two['frequency'], color='red', s=50) # 's' is the size of the dot
# Enhancing the plot
plt.title('Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
# Show the plot
plt.show()
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
##time of the day??
other
| user_id | signup_dt | year | month | day | date | |
|---|---|---|---|---|---|---|
| 0 | 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc | 2020-10-13 00:06:21.603537 | 2020 | 10 | 13 | 2020-10-13 |
| 1 | 2752a3f2-0a96-42f9-bcc3-5fde7edec06b | 2020-10-13 00:12:20.590843 | 2020 | 10 | 13 | 2020-10-13 |
| 2 | 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 | 2020-10-13 00:15:26.586054 | 2020 | 10 | 13 | 2020-10-13 |
| 3 | 0067c279-3b94-4867-ae51-aa182cfc8dc3 | 2020-10-13 00:17:16.041499 | 2020 | 10 | 13 | 2020-10-13 |
| 4 | 972e17fd-95b5-4bcc-8f17-c4323abd6c7b | 2020-10-13 00:21:46.310764 | 2020 | 10 | 13 | 2020-10-13 |
| ... | ... | ... | ... | ... | ... | ... |
| 95570 | 9d562896-402f-43fb-b2e8-95674b59c00c | 2020-10-19 00:03:58.651384 | 2020 | 10 | 19 | 2020-10-19 |
| 95571 | 2db26318-1e86-4611-8343-f22c2f52f700 | 2020-10-27 17:36:53.948341 | 2020 | 10 | 27 | 2020-10-27 |
| 95572 | 19444a27-e573-4b06-9ebf-f49a80cb27ba | 2020-11-05 13:43:05.890775 | 2020 | 11 | 5 | 2020-11-05 |
| 95573 | b2cf7001-2145-4653-a58f-26f658d6fed0 | 2020-11-01 22:49:39.533347 | 2020 | 11 | 1 | 2020-11-01 |
| 95574 | a9b1e320-847f-438f-b9c6-d3b374810d6c | 2020-10-24 15:36:18.286066 | 2020 | 10 | 24 | 2020-10-24 |
95575 rows × 6 columns
# Assuming 'other' is your DataFrame and 'signup_dt' is your datetime column
other['Hour'] = other['signup_dt'].dt.hour
other['minute'] = other['signup_dt'].dt.minute
other['seconds'] = other['signup_dt'].dt.second
#other['time'] = pd.to_datetime(other[['hour', 'minute', 'seconds']])
other_t = other.groupby(['Hour'])['signup_dt'].count().reset_index(name = 'Frequency')
other_t
| Hour | Frequency | |
|---|---|---|
| 0 | 0 | 5275 |
| 1 | 1 | 5461 |
| 2 | 2 | 5679 |
| 3 | 3 | 5220 |
| 4 | 4 | 4225 |
| 5 | 5 | 3217 |
| 6 | 6 | 1939 |
| 7 | 7 | 1892 |
| 8 | 8 | 1988 |
| 9 | 9 | 1744 |
| 10 | 10 | 1762 |
| 11 | 11 | 2074 |
| 12 | 12 | 2713 |
| 13 | 13 | 3864 |
| 14 | 14 | 4402 |
| 15 | 15 | 4614 |
| 16 | 16 | 4854 |
| 17 | 17 | 5211 |
| 18 | 18 | 5039 |
| 19 | 19 | 5079 |
| 20 | 20 | 4864 |
| 21 | 21 | 4846 |
| 22 | 22 | 4661 |
| 23 | 23 | 4952 |
# Create a line plot
sns.lineplot(data=other_t, x='Hour', y='Frequency')
plt.title('Frequency Over Hour')
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
Text(0.5, 1.0, 'Frequency Over Hour')